>>>>>>> Stashed changes
<<<<<<< Updated upstream
Code
import pandas as pd
# Load the two CSV inputs; both paths are relative to the working directory.
eda = pd.read_csv('data/eda_data.csv')
# NOTE(review): `job_posts` is not referenced by any of the visible cells — confirm it is still needed.
job_posts = pd.read_csv("data/lightcast_job_postings.csv")
eda.head()
| | COMPANY | LOCATION | POSTED | MIN_EDULEVELS_NAME | MAX_EDULEVELS_NAME | MIN_YEARS_EXPERIENCE | MAX_YEARS_EXPERIENCE | TITLE | SKILLS | SPECIALIZED_SKILLS | ... | COMMON_SKILLS | SOFTWARE_SKILLS | SOC_2021_4_NAME | NAICS_2022_6 | NAICS2_NAME | REMOTE_TYPE_NAME | SALARY | TITLE_NAME | SKILLS_NAME | SPECIALIZED_SKILLS_NAME |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 894731.0 | {\n "lat": 33.20763,\n "lon": -92.6662674\n} | 6/2/2024 | Bachelor's degree | Master's degree | 2.000000 | 2.000000 | ET29C073C03D1F86B4 | [\n "KS126DB6T061MHD7RTGQ",\n "KS126706DPFD3... | [\n "KS126DB6T061MHD7RTGQ",\n "KS128006L3V0H... | ... | [\n "KS126706DPFD3354M7YK",\n "KS1280B68GD79... | [\n "KS440W865GC4VRBW6LJP",\n "KS13USA80NE38... | Data Scientists | 441330.0 | Retail Trade | [None] | 116300.0 | Enterprise Analysts | [\n "Merchandising",\n "Mathematics",\n "Pr... | [\n "Merchandising",\n "Predictive Modeling"... |
| 1 | 133098.0 | {\n "lat": 44.3106241,\n "lon": -69.7794897\n} | 6/2/2024 | No Education Listed | Master's degree | 3.000000 | 3.000000 | ET21DDA63780A7DC09 | [\n "KS122626T550SLQ7QZ1C",\n "KS123YJ6KVWC9... | [\n "KS122626T550SLQ7QZ1C",\n "KS123YJ6KVWC9... | ... | [] | [\n "BGSBF3F508F7F46312E3",\n "ESEA839CED378... | Data Scientists | 561320.0 | Administrative and Support and Waste Managemen... | Remote | 116300.0 | Oracle Consultants | [\n "Procurement",\n "Financial Statements",... | [\n "Procurement",\n "Financial Statements",... |
| 2 | 39063746.0 | {\n "lat": 32.7766642,\n "lon": -96.7969879\n} | 6/2/2024 | Bachelor's degree | Master's degree | 5.000000 | 3.773903 | ET3037E0C947A02404 | [\n "KS1218W78FGVPVP2KXPX",\n "ESF3939CE1F80... | [\n "ESF3939CE1F80C10C327",\n "KS120GV6C72JM... | ... | [\n "KS1218W78FGVPVP2KXPX",\n "BGS1ADAA36DB6... | [\n "KS126HY6YLTB9R7XJC4Z"\n] | Data Scientists | 524291.0 | Finance and Insurance | [None] | 116300.0 | Data Analysts | [\n "Management",\n "Exception Reporting",\n... | [\n "Exception Reporting",\n "Data Analysis"... |
| 3 | 37615159.0 | {\n "lat": 33.4483771,\n "lon": -112.0740373\n} | 6/2/2024 | No Education Listed | Master's degree | 3.000000 | 3.773903 | ET2114E0404BA30075 | [\n "KS123QX62QYTC4JF38H8",\n "KS7G6NP6R6L1H... | [\n "KS123QX62QYTC4JF38H8",\n "KS441PQ64HT13... | ... | [\n "KS7G6NP6R6L1H1SKFTSY",\n "KS1218W78FGVP... | [\n "KS4409D76NW1S5LNCL18",\n "ESC7869CF7378... | Data Scientists | 522110.0 | Finance and Insurance | [None] | 116300.0 | Management Analysts | [\n "Exit Strategies",\n "Reliability",\n "... | [\n "Exit Strategies",\n "User Story",\n "H... |
| 4 | 0.0 | {\n "lat": 37.6392595,\n "lon": -120.9970014\n} | 6/2/2024 | No Education Listed | Master's degree | 5.486444 | 3.773903 | ET0000000000000000 | [] | [] | ... | [] | [] | Data Scientists | 999999.0 | Unclassified Industry | [None] | 92500.0 | Unclassified | [] | [] |
5 rows × 21 columns
=======
>>>>>>> Stashed changes
<<<<<<< Updated upstream
Code
# Identify data-analyst jobs by keyword search over the title and skill columns.
# Every entry needs its own trailing comma: adjacent string literals are
# silently concatenated, which previously fused 'Market Research Analyst' and
# 'LLM' into a single keyword so that neither was searched for on its own.
keywords = [
    'Data Analyst', 'Business Analyst', 'Data Engineering', 'Deep Learning',
    'Data Science', 'Data Analysis', 'Data Analytics', 'Market Research Analyst',
    'LLM', 'Language Model', 'NLP', 'Natural Language Processing',
    'Computer Vision', 'Business Intelligence Analyst', 'Quantitative Analyst',
    'Operations Analyst',
]
# Build the alternation pattern once instead of on every call.
keyword_pattern = '|'.join(keywords)


def match(col):
    """Return a boolean Series that is True where eda[col] contains any keyword.

    The search is case-insensitive and missing values count as "no match".
    """
    return eda[col].str.contains(keyword_pattern, case=False, na=False)


# A posting is flagged when any of the three text columns mentions a keyword.
eda['DATA_ANALYST_JOB'] = (
    match('TITLE_NAME')
    | match('SKILLS_NAME')
    | match('SPECIALIZED_SKILLS_NAME')
)
eda['DATA_ANALYST_JOB'].value_counts()
DATA_ANALYST_JOB
False 37043
True 32155
Name: count, dtype: int64
=======
>>>>>>> Stashed changes
<<<<<<< Updated upstream
Code
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# -----------------------------------------------------------------------------
# 1) Prepare your data
# -----------------------------------------------------------------------------
# Display names for the long NAICS sector labels; anything not listed here
# keeps its original name.
short_names = {
    'Professional, Scientific, and Technical Services': 'Prof. Services',
    'Administrative and Support and Waste Management and Remediation Services': 'Admin & Waste Mgmt',
    'Health Care and Social Assistance': 'Healthcare',
    'Finance and Insurance': 'Finance',
    'Information': 'Info Tech',
    'Educational Services': 'Education',
    'Manufacturing': 'Manufacturing',
    'Retail Trade': 'Retail',
    'Accommodation and Food Services': 'Hospitality',
    'Other Services (except Public Administration)': 'Other Services'
}

# One row per (flag, industry) pair with the number of postings in it.
df_grouped = eda.groupby(['DATA_ANALYST_JOB','NAICS2_NAME']).size().reset_index(name='Job_Count')
df_grouped['Industry'] = df_grouped['NAICS2_NAME'].map(short_names).fillna(df_grouped['NAICS2_NAME'])
# String labels so the pivot below gets 'True' / 'False' column names.
df_grouped['Job_Type'] = df_grouped['DATA_ANALYST_JOB'].map({True:'True', False:'False'})

# Wide layout: one row per industry, one count column per job type.
pivot = df_grouped.pivot_table(
    index='Industry',
    columns='Job_Type',
    values='Job_Count',
    fill_value=0,
).reset_index()

# Plain lists feed both the bar traces and the table.
industries, y_true, y_false = (
    pivot[column].tolist() for column in ('Industry', 'True', 'False')
)
# -----------------------------------------------------------------------------
# 2) Build a 2-row subplot: bar on top, table below
# -----------------------------------------------------------------------------
fig = make_subplots(
    rows=2,
    cols=1,
    row_heights=[0.70, 0.30],          # bar chart gets 70 %, table 30 %
    specs=[[{"type":"bar"}],[{"type":"table"}]],
    vertical_spacing=0.12,             # gap between bar chart and table
)
colors = {'True': '#FFE5E5', 'False': '#FF6B6B'}

# The two bar traces differ only in name, data and fill colour; add the
# 'True' trace first so trace order is unchanged.
for trace_name, counts in (('True', y_true), ('False', y_false)):
    fig.add_trace(
        go.Bar(
            x=industries,
            y=counts,
            name=trace_name,
            marker={
                "color": colors[trace_name],
                "line": {"color": '#A81D1D', "width": 1},
            },
            text=counts,
            textposition='outside',
        ),
        row=1, col=1,
    )

# Table underneath repeats the same numbers in tabular form.
table_header = {
    "values": ["Industry","True","False"],
    "fill_color": '#FDEDEC',
    "align": 'left',
    "font": {"color": '#A81D1D', "size": 13},
    "height": 30,
}
table_cells = {
    "values": [industries, y_true, y_false],
    "fill_color": 'white',
    "align": 'left',
    "font": {"color": '#333', "size": 11},
    "height": 22,
}
fig.add_trace(go.Table(header=table_header, cells=table_cells), row=2, col=1)
# -----------------------------------------------------------------------------
# 3) Slider steps: 0 → 8 000 in 200s
# -----------------------------------------------------------------------------
# Each step hides (zeroes) every bar whose count is below the threshold.
# Three details matter for the step to behave correctly:
#   * the restyle is limited to traces 0 and 1 (the two bars) so the table
#     trace is not handed a meaningless `y`;
#   * `text` is updated together with `y`, otherwise a zeroed bar keeps
#     displaying its original count as a label;
#   * the layout update targets `title.text` so the title's font and position
#     are kept instead of the whole title being replaced by a bare string.
steps = []
for val in range(0, 8001, 200):
    visible_true = [v if v >= val else 0 for v in y_true]
    visible_false = [v if v >= val else 0 for v in y_false]
    # Blank label for hidden bars, original count otherwise.
    labels_true = [v if v >= val else "" for v in y_true]
    labels_false = [v if v >= val else "" for v in y_false]
    steps.append(dict(
        label=str(val),
        method="update",
        args=[
            {
                "y": [visible_true, visible_false],
                "text": [labels_true, labels_false],
            },
            {"title.text": f"Min Jobs ≥ {val:,}"},
            [0, 1],  # trace indices: 'True' bar, 'False' bar
        ]
    ))
# -----------------------------------------------------------------------------
# 4) Final layout tweaks
# -----------------------------------------------------------------------------
fig.update_layout(
    # Slider sits above the plot area; the title is placed just below it.
    sliders=[{
        "active": 0,
        "currentvalue": {"prefix":"Min Jobs: "},
        "pad": {"b":0},
        "x": 0.15,
        "y": 1.18,
        "xanchor": "left",
        "yanchor": "bottom",
        "len": 0.7,
        "font": {"color": '#A81D1D'},
        "steps": steps,
    }],
    title={
        "text": "Data & Business Analytics Job Trends",
        "font": {"size": 24, "color": '#A81D1D'},
        "x": 0.5,
        "y": 0.92,
        "xanchor": "center",
        "yanchor": "top",
    },
    width=1100,
    height=850,
    # Extra top margin for the slider, extra bottom margin for the table.
    margin={"l": 60, "r": 60, "t": 180, "b": 200},
    plot_bgcolor='white',
    paper_bgcolor='white',
    xaxis={
        "title": "Industry",
        "title_font": {"size": 16, "color": '#A81D1D'},
        "tickmode": 'array',
        "tickvals": list(range(len(industries))),
        "ticktext": industries,
        "tickangle": -30,
        "tickfont": {"size": 11, "color": '#333'},
        "showline": True,
        "linecolor": '#A81D1D',
    },
    yaxis={
        "title": "Number of Jobs",
        "title_font": {"size": 16, "color": '#A81D1D'},
        "tickfont": {"size": 11, "color": '#333'},
        "gridcolor": 'rgba(200,200,200,0.3)',
        "showline": True,
        "linecolor": '#A81D1D',
        # 20 % headroom above the tallest bar for the outside text labels.
        "range": [0, max(max(y_true),max(y_false))*1.2],
    },
    legend={
        "title": "Data Analyst Job",
        "title_font": {"color": '#A81D1D'},
        "font": {"size": 12},
        "x": 1.02,
        "y": 0.5,
    },
    bargap=0.2,
)
fig.show()
=======
>>>>>>> Stashed changes
<<<<<<< Updated upstream
Code
import plotly.express as px
import pandas as pd
# Prepare the data
# Work on a copy so the helper columns added in this cell do not modify `eda`.
df = eda.copy()
# Define analytics jobs (Data Analyst + Business Analyst)
def classify_analytics_job(row):
    """Return True when a posting row counts as an analytics job.

    A row qualifies if its 'DATA_ANALYST_JOB' value is truthy, or if its title
    contains "business analyst" (case-insensitive). The title is read from
    'TITLE_NAME' when the row has that key, otherwise from 'TITLE'.
    """
    if row['DATA_ANALYST_JOB']:
        return True
    title_key = 'TITLE_NAME' if 'TITLE_NAME' in row else 'TITLE'
    return 'business analyst' in str(row[title_key]).lower()
# Flag every posting, then derive the legend label from that flag.
is_analytics = df.apply(classify_analytics_job, axis=1)
df['IS_ANALYTICS_JOB'] = is_analytics
df['Job_Category'] = is_analytics.map({True: 'Analytics Job', False: 'Non-Analytics Job'})

# Box plot of salary per remote type, one box per job category.
axis_labels = {'REMOTE_TYPE_NAME': 'Remote Type', 'SALARY': 'Salary ($)', 'Job_Category': 'Job Category'}
category_colors = {'Analytics Job': '#FF6B6B', 'Non-Analytics Job': '#4ECDC4'}
fig = px.box(
    df,
    x='REMOTE_TYPE_NAME',
    y='SALARY',
    color='Job_Category',
    title='Salary Distribution by Remote Type for Analytics vs Non-Analytics Jobs',
    labels=axis_labels,
    color_discrete_map=category_colors,
)
# Red-and-white theme: flat white backgrounds, red title and borders.
fig.update_layout(
    width=900,
    height=600,
    plot_bgcolor='#FFFFFF',
    paper_bgcolor='#FFFFFF',
    font={"family": "Inter, sans-serif", "size": 14, "color": "#2D3748"},
    title={
        "font": {"size": 24, "color": "#FF6B6B"},
        "x": 0.5,
        "xanchor": "center",
        "y": 0.95,
        "yanchor": "top",
    },
    xaxis={
        "title": "Remote Type",
        "title_font": {"size": 16},
        "tickfont": {"size": 12},
        "gridcolor": "#E2E8F0",
        "linecolor": "#2D3748",
        "linewidth": 2,
        "showline": True,
    },
    yaxis={
        "title": "Salary ($)",
        "title_font": {"size": 16},
        "tickfont": {"size": 12},
        "gridcolor": "#E2E8F0",
        "linecolor": "#2D3748",
        "linewidth": 2,
        "showline": True,
        "showgrid": True,
        "zeroline": False,
    },
    # Legend is anchored to the right of the plot, vertically centred.
    legend={
        "title": "Job Category",
        "font": {"size": 13},
        "bgcolor": "#FFFFFF",
        "bordercolor": "#FF6B6B",
        "borderwidth": 1,
        "x": 1.02,
        "y": 0.5,
        "xanchor": "left",
        "yanchor": "middle",
    },
    hovermode="closest",
    hoverlabel={
        "bgcolor": "#FFFFFF",
        "font_size": 12,
        "font_family": "Inter, sans-serif",
        "font_color": "#2D3748",
        "bordercolor": "#FF6B6B",
    },
)
# Render the figure.
fig.show()
=======
>>>>>>> Stashed changes
<<<<<<< Updated upstream
Code
import plotly.express as px
import pandas as pd
# Prepare the data
# Work on a copy so the helper columns added in this cell do not modify `eda`.
df = eda.copy()
# Define analytics jobs (Data Analyst + Business Analyst)
def classify_analytics_job(row):
    """Return True when a posting row counts as an analytics job.

    A row qualifies if its 'DATA_ANALYST_JOB' value is truthy, or if its title
    contains "business analyst" (case-insensitive). The title is read from
    'TITLE_NAME' when the row has that key, otherwise from 'TITLE'.
    """
    if row['DATA_ANALYST_JOB']:
        return True
    title_key = 'TITLE_NAME' if 'TITLE_NAME' in row else 'TITLE'
    return 'business analyst' in str(row[title_key]).lower()
# Flag every posting, then derive the legend label from that flag.
is_analytics = df.apply(classify_analytics_job, axis=1)
df['IS_ANALYTICS_JOB'] = is_analytics
df['Job_Category'] = is_analytics.map({True: 'Analytics Job', False: 'Non-Analytics Job'})

# Posting counts per (industry, flag) pair, labelled the same way as `df`.
df_grouped = (
    df.groupby(['NAICS2_NAME', 'IS_ANALYTICS_JOB'])
    .size()
    .reset_index(name='Job_Count')
)
df_grouped['Job_Category'] = df_grouped['IS_ANALYTICS_JOB'].map(
    {True: 'Analytics Job', False: 'Non-Analytics Job'}
)

# Short display names for the long sector labels; unlisted sectors keep
# their original name.
short_map = {
    'Professional, Scientific, and Technical Services': 'Prof. Services',
    'Administrative and Support and Waste Management and Remediation Services': 'Admin & Waste Mgmt',
    'Health Care and Social Assistance': 'Healthcare',
    'Finance and Insurance': 'Finance',
    'Information': 'Info Tech',
    'Educational Services': 'Education',
    'Manufacturing': 'Manufacturing',
    'Retail Trade': 'Retail',
    'Accommodation and Food Services': 'Hospitality',
    'Other Services (except Public Administration)': 'Other Services'
}
full_names = df_grouped['NAICS2_NAME']
df_grouped['Industry'] = full_names.map(short_map).fillna(full_names)

# Stacked bars: one bar per industry, split by job category.
fig = px.bar(
    df_grouped,
    x='Industry',
    y='Job_Count',
    color='Job_Category',
    title='Top Industries Hiring Analytics Jobs',
    labels={'Industry': 'Industry', 'Job_Count': 'Number of Jobs', 'Job_Category': 'Job Category'},
    barmode='stack',
    color_discrete_map={'Analytics Job': '#FF6B6B', 'Non-Analytics Job': '#4ECDC4'},
)
# Red-and-white theme: flat white backgrounds, red title and borders.
fig.update_layout(
    width=1000,
    height=600,
    plot_bgcolor='#FFFFFF',
    paper_bgcolor='#FFFFFF',
    font={"family": "Inter, sans-serif", "size": 14, "color": "#2D3748"},
    title={
        "font": {"size": 24, "color": "#FF6B6B"},
        "x": 0.5,
        "xanchor": "center",
        "y": 0.95,
        "yanchor": "top",
    },
    xaxis={
        "title": "Industry",
        "title_font": {"size": 16},
        "tickfont": {"size": 12},
        "tickangle": -45,  # slanted so the industry names do not overlap
        "gridcolor": "#E2E8F0",
        "linecolor": "#2D3748",
        "linewidth": 2,
        "showline": True,
    },
    yaxis={
        "title": "Number of Jobs",
        "title_font": {"size": 16},
        "tickfont": {"size": 12},
        "gridcolor": "#E2E8F0",
        "linecolor": "#2D3748",
        "linewidth": 2,
        "showline": True,
        "showgrid": True,
        "zeroline": False,
    },
    # Legend is anchored to the right of the plot, vertically centred.
    legend={
        "title": "Job Category",
        "font": {"size": 13},
        "bgcolor": "#FFFFFF",
        "bordercolor": "#FF6B6B",
        "borderwidth": 1,
        "x": 1.02,
        "y": 0.5,
        "xanchor": "left",
        "yanchor": "middle",
    },
    hovermode="closest",
    hoverlabel={
        "bgcolor": "#FFFFFF",
        "font_size": 12,
        "font_family": "Inter, sans-serif",
        "font_color": "#2D3748",
        "bordercolor": "#FF6B6B",
    },
)
# Render the figure.
fig.show()
=======
>>>>>>> Stashed changes
<<<<<<< Updated upstream
Code
import plotly.express as px
import pandas as pd
# Prepare the data
# Work on a copy so the helper columns added in this cell do not modify `eda`.
df = eda.copy()
# Define analytics jobs (Data Analyst + Business Analyst)
def classify_analytics_job(row):
    """Return True when a posting row counts as an analytics job.

    A row qualifies if its 'DATA_ANALYST_JOB' value is truthy, or if its title
    contains "business analyst" (case-insensitive). The title is read from
    'TITLE_NAME' when the row has that key, otherwise from 'TITLE'.
    """
    if row['DATA_ANALYST_JOB']:
        return True
    title_key = 'TITLE_NAME' if 'TITLE_NAME' in row else 'TITLE'
    return 'business analyst' in str(row[title_key]).lower()
import numpy as np  # used only for the least-squares trend lines below

# Flag every posting, then derive the legend label from that flag.
df['IS_ANALYTICS_JOB'] = df.apply(classify_analytics_job, axis=1)
df['Job_Category'] = df['IS_ANALYTICS_JOB'].map({True: 'Analytics Job', False: 'Non-Analytics Job'})
# Calculate average years of experience
df['Avg_Years_Experience'] = (df['MIN_YEARS_EXPERIENCE'] + df['MAX_YEARS_EXPERIENCE']) / 2
# Clean the data (remove rows with missing salary or experience)
df = df.dropna(subset=['Avg_Years_Experience', 'SALARY'])

category_colors = {'Analytics Job': '#FF6B6B', 'Non-Analytics Job': '#4ECDC4'}

# Create the scatter plot. `trendline='ols'` is deliberately not used: it makes
# plotly import statsmodels, which is not installed in this environment and
# made the cell fail with ModuleNotFoundError.
fig = px.scatter(df,
                 x='Avg_Years_Experience',
                 y='SALARY',
                 color='Job_Category',
                 title='Experience Requirements vs Salary for Analytics Jobs',
                 labels={'Avg_Years_Experience': 'Average Years of Experience', 'SALARY': 'Salary ($)', 'Job_Category': 'Job Category'},
                 color_discrete_map=category_colors)

# Add one ordinary-least-squares trend line per job category, fitted with
# numpy (degree-1 polynomial) and drawn across that category's x range.
for category, group in df.groupby('Job_Category'):
    x_vals = group['Avg_Years_Experience'].to_numpy(dtype=float)
    y_vals = group['SALARY'].to_numpy(dtype=float)
    if np.unique(x_vals).size < 2:
        continue  # a line cannot be fitted through a single x value
    slope, intercept = np.polyfit(x_vals, y_vals, 1)
    x_line = np.array([x_vals.min(), x_vals.max()])
    fig.add_scatter(
        x=x_line,
        y=slope * x_line + intercept,
        mode='lines',
        name=f'{category} trend',
        line=dict(color=category_colors.get(category), width=2),
    )
# Red-and-white theme: flat white backgrounds, red title and borders.
fig.update_layout(
    width=900,
    height=600,
    plot_bgcolor='#FFFFFF',
    paper_bgcolor='#FFFFFF',
    font={"family": "Inter, sans-serif", "size": 14, "color": "#2D3748"},
    title={
        "font": {"size": 24, "color": "#FF6B6B"},
        "x": 0.5,
        "xanchor": "center",
        "y": 0.95,
        "yanchor": "top",
    },
    xaxis={
        "title": "Average Years of Experience",
        "title_font": {"size": 16},
        "tickfont": {"size": 12},
        "gridcolor": "#E2E8F0",
        "linecolor": "#2D3748",
        "linewidth": 2,
        "showline": True,
        "showgrid": True,
        "zeroline": False,
    },
    yaxis={
        "title": "Salary ($)",
        "title_font": {"size": 16},
        "tickfont": {"size": 12},
        "gridcolor": "#E2E8F0",
        "linecolor": "#2D3748",
        "linewidth": 2,
        "showline": True,
        "showgrid": True,
        "zeroline": False,
    },
    # Legend is anchored to the right of the plot, vertically centred.
    legend={
        "title": "Job Category",
        "font": {"size": 13},
        "bgcolor": "#FFFFFF",
        "bordercolor": "#FF6B6B",
        "borderwidth": 1,
        "x": 1.02,
        "y": 0.5,
        "xanchor": "left",
        "yanchor": "middle",
    },
    hovermode="closest",
    hoverlabel={
        "bgcolor": "#FFFFFF",
        "font_size": 12,
        "font_family": "Inter, sans-serif",
        "font_color": "#2D3748",
        "bordercolor": "#FF6B6B",
    },
)
# Marker styling: semi-transparent points with a thin dark outline.
point_style = {
    "size": 8,
    "opacity": 0.7,
    "line": {"width": 1, "color": "#2D3748"},
}
fig.update_traces(marker=point_style)
# Show the plot
fig.show()
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[6], line 26
     23 df = df.dropna(subset=['Avg_Years_Experience', 'SALARY'])
     25 # Create the scatter plot with trend line
---> 26 fig = px.scatter(df,
     27                  x='Avg_Years_Experience',
     28                  y='SALARY',
     29                  color='Job_Category',
     30                  trendline='ols', # Add trend line (ordinary least squares)
     31                  title='Experience Requirements vs Salary for Analytics Jobs',
     32                  labels={'Avg_Years_Experience': 'Average Years of Experience', 'SALARY': 'Salary ($)', 'Job_Category': 'Job Category'},
     33                  color_discrete_map={'Analytics Job': '#FF6B6B', 'Non-Analytics Job': '#4ECDC4'})
     35 # Beautify the layout with a red-white theme (no gradients)
     36 fig.update_layout(
     37     width=900,
     38     height=600,
   (...)
     89     )
     90 )

File ~/Documents/Semester2/Web_analytics/ad688-employability-sp25A1-group11/.venv/lib/python3.13/site-packages/plotly/express/_chart_types.py:69, in scatter(data_frame, x, y, color, symbol, size, hover_name, hover_data, custom_data, text, facet_row, facet_col, facet_col_wrap, facet_row_spacing, facet_col_spacing, error_x, error_x_minus, error_y, error_y_minus, animation_frame, animation_group, category_orders, labels, orientation, color_discrete_sequence, color_discrete_map, color_continuous_scale, range_color, color_continuous_midpoint, symbol_sequence, symbol_map, opacity, size_max, marginal_x, marginal_y, trendline, trendline_options, trendline_color_override, trendline_scope, log_x, log_y, range_x, range_y, render_mode, title, subtitle, template, width, height)
     14 def scatter(
     15     data_frame=None,
     16     x=None,
   (...)
     63     height=None,
     64 ) -> go.Figure:
     65     """
     66     In a scatter plot, each row of `data_frame` is represented by a symbol
     67     mark in 2D space.
     68     """
---> 69     return make_figure(args=locals(), constructor=go.Scatter)

File ~/Documents/Semester2/Web_analytics/ad688-employability-sp25A1-group11/.venv/lib/python3.13/site-packages/plotly/express/_core.py:2668, in make_figure(args, constructor, trace_patch, layout_patch)
   2665     elif args["ecdfnorm"] == "percent":
   2666         group = group.with_columns((nw.col(var) / group_sum) * 100.0)
-> 2668 patch, fit_results = make_trace_kwargs(
   2669     args, trace_spec, group, mapping_labels.copy(), sizeref
   2670 )
   2671 trace.update(patch)
   2672 if fit_results is not None:

File ~/Documents/Semester2/Web_analytics/ad688-employability-sp25A1-group11/.venv/lib/python3.13/site-packages/plotly/express/_core.py:430, in make_trace_kwargs(args, trace_spec, trace_data, mapping_labels, sizeref)
    427     trace_patch["x"] = trace_patch["x"].to_numpy()
    429 trendline_function = trendline_functions[attr_value]
--> 430 y_out, hover_header, fit_results = trendline_function(
    431     args["trendline_options"],
    432     sorted_trace_data.get_column(args["x"]),  # narwhals series
    433     x.to_numpy(),  # numpy array
    434     y.to_numpy(),  # numpy array
    435     args["x"],
    436     args["y"],
    437     non_missing.to_numpy(),  # numpy array
    438 )
    439 assert len(y_out) == len(
    440     trace_patch["x"]
    441 ), "missing-data-handling failure in trendline code"
    442 trace_patch["y"] = y_out

File ~/Documents/Semester2/Web_analytics/ad688-employability-sp25A1-group11/.venv/lib/python3.13/site-packages/plotly/express/trendline_functions/__init__.py:42, in ols(trendline_options, x_raw, x, y, x_label, y_label, non_missing)
     36 if k not in valid_options:
     37     raise ValueError(
     38         "OLS trendline_options keys must be one of [%s] but got '%s'"
     39         % (", ".join(valid_options), k)
     40     )
---> 42 import statsmodels.api as sm
     44 add_constant = trendline_options.get("add_constant", True)
     45 log_x = trendline_options.get("log_x", False)

ModuleNotFoundError: No module named 'statsmodels'
=======
>>>>>>> Stashed changes
Code
import plotly.graph_objects as go
import pandas as pd
# Prepare the data
# Work on a copy so the helper columns added in this cell do not modify `eda`.
df = eda.copy()
# Define analytics jobs (Data Analyst + Business Analyst)
def classify_analytics_job(row):
    """Return True when a posting row counts as an analytics job.

    A row qualifies if its 'DATA_ANALYST_JOB' value is truthy, or if its title
    contains "business analyst" (case-insensitive). The title is read from
    'TITLE_NAME' when the row has that key, otherwise from 'TITLE'.
    """
    if row['DATA_ANALYST_JOB']:
        return True
    title_key = 'TITLE_NAME' if 'TITLE_NAME' in row else 'TITLE'
    return 'business analyst' in str(row[title_key]).lower()
# Flag every posting, then derive the node label from that flag.
is_analytics = df.apply(classify_analytics_job, axis=1)
df['IS_ANALYTICS_JOB'] = is_analytics
df['Job_Category'] = is_analytics.map({True: 'Analytics Job', False: 'Non-Analytics Job'})

# Keep only analytics postings that have an industry recorded.
df_analytics = df.loc[df['IS_ANALYTICS_JOB']].copy().dropna(subset=['NAICS2_NAME'])

# One row per (job category, industry) link with its posting count.
df_grouped = (
    df_analytics
    .groupby(['Job_Category', 'NAICS2_NAME'])
    .size()
    .reset_index(name='Job_Count')
)

# Short display names for the long sector labels; unlisted sectors keep
# their original name.
short_map = {
    'Professional, Scientific, and Technical Services': 'Prof. Services',
    'Administrative and Support and Waste Management and Remediation Services': 'Admin & Waste Mgmt',
    'Health Care and Social Assistance': 'Healthcare',
    'Finance and Insurance': 'Finance',
    'Information': 'Info Tech',
    'Educational Services': 'Education',
    'Manufacturing': 'Manufacturing',
    'Retail Trade': 'Retail',
    'Accommodation and Food Services': 'Hospitality',
    'Other Services (except Public Administration)': 'Other Services'
}
full_names = df_grouped['NAICS2_NAME']
df_grouped['NAICS2_NAME'] = full_names.map(short_map).fillna(full_names)

# Sankey inputs: the node list holds job categories first, then industries;
# each link refers to its two endpoints by position in that list.
labels = [*df_grouped['Job_Category'].unique(), *df_grouped['NAICS2_NAME'].unique()]
source = list(map(labels.index, df_grouped['Job_Category']))
target = list(map(labels.index, df_grouped['NAICS2_NAME']))
value = df_grouped['Job_Count'].tolist()
# Sankey diagram: red nodes, semi-transparent red links.
sankey_nodes = {
    "pad": 15,
    "thickness": 20,
    "line": {"color": "#2D3748", "width": 0.5},
    "label": labels,
    "color": "#FF6B6B",
}
sankey_links = {
    "source": source,
    "target": target,
    "value": value,
    "color": "rgba(255, 107, 107, 0.5)",
}
fig = go.Figure(data=[go.Sankey(node=sankey_nodes, link=sankey_links)])

# Red-and-white theme: flat white backgrounds and a centred red title.
fig.update_layout(
    width=900,
    height=600,
    plot_bgcolor='#FFFFFF',
    paper_bgcolor='#FFFFFF',
    font={"family": "Inter, sans-serif", "size": 14, "color": "#2D3748"},
    title={
        "text": 'Distribution of Analytics Job Postings by Industry',
        "font": {"size": 24, "color": "#FF6B6B"},
        "x": 0.5,
        "xanchor": "center",
        "y": 0.95,
        "yanchor": "top",
    },
    margin={"l": 20, "r": 20, "t": 80, "b": 20},
)
# Render the figure.
fig.show()